notebook.community

Edit and run



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline



In [2]:

    
# Show what is in the working directory; the CSVs loaded below are read
# with relative paths, so they are expected to appear in this listing.
working_dir = os.getcwd()
os.listdir(working_dir)









    Out[2]:





['.ipynb_checkpoints',
 'gender_submission.csv',
 'test.csv',
 'train.csv',
 '[Kaggle] Titanic.ipynb']



In [3]:

    
# Read the two Kaggle CSV files (paths are relative to the working directory).
train_path, test_path = 'train.csv', 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)



In [4]:

    
# Peek at the first five rows of the training frame.
train.head(n=5)









    Out[4]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [5]:

    
# First pass over the data: summary statistics of the numeric columns,
# followed by the dtype of every column.
summary_stats = train.describe()
print(summary_stats)
column_types = train.dtypes
print(column_types)









    



       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [6]:

    
# Number of missing entries per column - quite a few values are missing.
train.isna().sum()









    Out[6]:





PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Cleaning



In [7]:

    
# Begin the cleaning with the Sex column: how many passengers per category?
train['Sex'].value_counts()









    Out[7]:





male      577
female    314
Name: Sex, dtype: int64



In [8]:

    
# convert sex to 1 (male) and 0 (female)
def sexconverter(row):
    """Encode a passenger row's ``Sex`` entry as an integer flag.

    Args:
        row: Mapping-like row with a ``'Sex'`` entry, e.g. the Series that
            ``DataFrame.apply(..., axis=1)`` passes for each row.

    Returns:
        1 when the entry is ``'male'`` (or is already the encoded value 1),
        otherwise 0.
    """
    sex = row['Sex']
    # Also accept the already-encoded 1: without this, re-running the
    # encoding cell on a converted frame finds no 'male' strings and
    # silently turns every passenger into 0.
    if sex == 'male' or sex == 1:
        return 1
    return 0
# Run the encoder once per row and overwrite the Sex column with the result.
train['Sex'] = train.apply(sexconverter, axis='columns')



In [9]:

    
# Infants: only 7 observations have a recorded age below one year.
is_infant = train['Age'] < 1
train[is_infant]









    Out[9]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      78
      79
      1
      2
      Caldwell, Master. Alden Gates
      1
      0.83
      0
      2
      248738
      29.0000
      NaN
      S
    
    
      305
      306
      1
      1
      Allison, Master. Hudson Trevor
      1
      0.92
      1
      2
      113781
      151.5500
      C22 C26
      S
    
    
      469
      470
      1
      3
      Baclini, Miss. Helene Barbara
      0
      0.75
      2
      1
      2666
      19.2583
      NaN
      C
    
    
      644
      645
      1
      3
      Baclini, Miss. Eugenie
      0
      0.75
      2
      1
      2666
      19.2583
      NaN
      C
    
    
      755
      756
      1
      2
      Hamalainen, Master. Viljo
      1
      0.67
      1
      1
      250649
      14.5000
      NaN
      S
    
    
      803
      804
      1
      3
      Thomas, Master. Assad Alexander
      1
      0.42
      0
      1
      2625
      8.5167
      NaN
      C
    
    
      831
      832
      1
      2
      Richards, Master. George Sibley
      1
      0.83
      1
      1
      29106
      18.7500
      NaN
      S



In [10]:

    
# Fare distribution: significantly non-normal, with strong right skewness.
# NOTE(review): seaborn >= 0.11 deprecates distplot in favour of
# histplot/displot - confirm the installed version before upgrading.
plt.figure(figsize=(10, 10))
sns.distplot(train['Fare'])









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x21dd7855908>



In [11]:

    
# Fare against survival, with points coloured by sex.
# Already we can see that mainly men did not survive!
plt.figure(figsize=(10, 10))
sns.swarmplot(data=train, x='Survived', y='Fare', hue='Sex')









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x21dd788d7f0>



In [12]:

    
# Survival counts broken down by gender, including the "All" margins.
tmp = pd.crosstab(train['Sex'], train['Survived'], margins=True)
tmp



In [13]:

    
# The same gender/survival table expressed as proportions within each row.
tmp_freq = pd.crosstab(train['Sex'], train['Survived'], margins=True, normalize='index')
tmp_freq



In [14]:

    
# Passenger class against survival (split further by sex), drawn as a heatmap.
# Mostly lower class people did not survive.
plt.figure(figsize=(10, 10))
tmp = pd.crosstab(train['Pclass'], [train['Survived'], train['Sex']])
sns.heatmap(tmp, cmap='plasma')









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x21dd8000940>



In [15]:

    
# Age against survival: a violin plot for the overall shape, with a
# half-transparent swarm (coloured by sex) drawn on the same axes.
plt.figure(figsize=(10, 10))
sns.violinplot(data=train, x='Survived', y='Age')
sns.swarmplot(data=train, x='Survived', y='Age', hue='Sex', alpha=0.5)









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x21dd7d9cef0>

Modeling

Simple Model: Gaussian Naive Bayes



In [34]:

    
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split



In [17]:

    
# Baseline classifier: Gaussian Naive Bayes with no fixed class priors
# (priors=None is the default, spelled out here for the reader).
clf = GaussianNB(priors=None)



In [ ]:

    
# impute for missing Age values with the mean of the observed ages
# (Series.mean skips NaN). One vectorised fillna replaces the per-element
# Python loop that recomputed the mean for every missing row.
# NOTE(review): the mean is taken over the full frame before the
# train/test split below, so held-out rows influence the imputed value.
train['Age'] = train['Age'].fillna(train['Age'].mean())



In [29]:

    
# Feature matrix and target array for the model.
# The target is selected with a one-element column list, so y is a
# single-column 2-D array at this point.
feature_columns = ['Sex', 'Age', 'Pclass']
target_column = ['Survived']
X = train[feature_columns].values
y = train[target_column].values



In [30]:

    
# Hold out 20% of the rows for evaluation. The shuffle seed is pinned so
# the split - and every metric computed from it - is the same on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [32]:

    
# Fit the classifier on the training split; the target is flattened to 1-D.
clf.fit(X_train, y_train.reshape(-1))









    Out[32]:





GaussianNB(priors=None)



In [40]:

    
# Accuracy of the fitted model on the held-out split, reported as a percentage.
y_pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of GNB model is %.2f%%" % (acc * 100))









    



Accuracy of GNB model is 75.42%



In [41]:

    
# Compute the inputs for the ROC curve on the held-out split.
probs = clf.predict_proba(X_test)
# The last probability column is used as the score
# (presumably the "survived" class for the 0/1 labels - verify via clf.classes_).
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_true=y_test, y_score=preds)
roc_auc = metrics.auc(x=fpr, y=tpr)



In [49]:

    
# Draw the ROC curve with the chance diagonal as a dashed red reference line.
plt.figure(figsize=(15, 15))
axis_font = dict(fontname='Arial', size='22')
roc_label = "ROC curve(area=%0.2f)" % roc_auc
plt.plot(fpr, tpr, 'b', label=roc_label)
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate", **axis_font)
plt.ylabel("True Positive Rate", **axis_font)
plt.legend(loc="lower right")









    Out[49]:





<matplotlib.legend.Legend at 0x21dd995cf98>

Survived	0	1
Sex
0	0.257962	0.742038
1	0.811092	0.188908
All	0.616162	0.383838

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
78	79	1	2	Caldwell, Master. Alden Gates	1	0.83	0	2	248738	29.0000	NaN	S
305	306	1	1	Allison, Master. Hudson Trevor	1	0.92	1	2	113781	151.5500	C22 C26	S
469	470	1	3	Baclini, Miss. Helene Barbara	0	0.75	2	1	2666	19.2583	NaN	C
644	645	1	3	Baclini, Miss. Eugenie	0	0.75	2	1	2666	19.2583	NaN	C
755	756	1	2	Hamalainen, Master. Viljo	1	0.67	1	1	250649	14.5000	NaN	S
803	804	1	3	Thomas, Master. Assad Alexander	1	0.42	0	1	2625	8.5167	NaN	C
831	832	1	2	Richards, Master. George Sibley	1	0.83	1	1	29106	18.7500	NaN	S